home *** CD-ROM | disk | FTP | other *** search
- #include <stdio.h>
- #include "thread.h"
- #include "synch.h"
- #include <usclkc.h>
- #include "quartzcommon.h"
- #include "profile.h"
- #include "internal.h"
-
/*
 * Number of GraphEntry records in each overflow pool.  May be overridden
 * from the compiler command line.
 */
#if !defined(InitOverflowSize)
#  define InitOverflowSize 5000
#endif

/* Batch sizes used when a new SynchSamples / ChildData table is allocated */
#define InitNumObjs 10
#define InitNumKids 10
-
- /* These get set by munge */
-
- private int mNumProcIds = 0;
- private int mEndOfText = 0;
-
-
- /* communication from the computation -> sampling processors */
-
- private int profileOn = FALSE;
-
- shared int effectiveParallelism = 0;
- shared int nominalParallelism = 0;
- shared int profileOver = FALSE;
- shared Processor processorList[NUMPROCS];
-
-
- /* Sampled data -- output to a file */
-
- shared FLOAT timeDiff = 0;
- shared int numSamples = 0;
-
- shared ConcurrentData *procData;
-
- shared GraphEntry *pcTable; /* needed by mcount */
- shared int pcTableSize;
-
-
- /* Needed internally to control profiling */
-
- static shared SpinLock *pcTableLocks;
-
- static shared GraphEntry overflowFirst[InitOverflowSize];
- static shared GraphEntry *overflow;
- static shared SpinLock overflowLock;
- static shared int overflowSize = 0;
- static shared int overflowOccurred = 0;
-
- static shared SynchSamples *objData;
- static shared int objNum = 0;
- static shared SpinLock objLock;
-
- static shared ChildData *kidData;
- static shared int kidNum = 0;
- static shared SpinLock kidLock;
-
- private int iteration = 1;
- private unsigned int tmpStack[InitIdStackSize];
- private int myHit;
-
- static shared int start = FALSE;
- static shared usclk_t startTime;
- static shared endCount;
- static shared SpinLock startLock;
-
- void OutOfRoom();
-
- /* Initialization routines */
-
- /* Init profiling data structures */
- static void OverflowSetup (o)
- GraphEntry *o;
- {
- GraphTableInit(o, InitOverflowSize);
- overflow = o;
- overflowSize = InitOverflowSize;
- overflowOccurred = 0;
- }
-
- void ProcessorListInit ()
- {
- int i;
- Thread *t;
- Processor *p;
-
- for (i = 0; i < NUMPROCS; i++)
- {
- p = &processorList[i];
-
- t = &p->idleThread;
- t->type = ThreadType;
- t->idStack.base = &p->idStack[0];
- if (i == 0)
- t->idStack.base->procID = StartID | BusyState;
- else
- t->idStack.base->procID = StartID | SpinState;
- t->idStack.top = t->idStack.base;
- t->idStack.limit = t->idStack.base + InitIdStackSize - 1;
-
- p->curThread = t;
- p->synchList = NULL;
- SLNPInit(&p->profLock);
- p->numSamples = 0;
- }
- }
-
- /* External entry point to initialize external data structures */
- void ProfileInit (numProfilers)
- int numProfilers;
- {
- int i;
-
- if (mEndOfText == 0)
- {
- fprintf(stderr, "Unable to profile: munge not run on object\n");
- exit(0);
- }
-
- pcTableSize = (mEndOfText + sizeof(GraphEntry)) / sizeof(GraphEntry);
- pcTable = MyShmalloc(GraphEntry, pcTableSize);
- GraphTableInit(pcTable, pcTableSize);
- pcTableLocks = MyShmalloc(SpinLock, pcTableSize);
- for (i = 0; i < pcTableSize; i++)
- SLNPInit(&pcTableLocks[i]);
-
- OverflowSetup(overflowFirst);
- SLNPInit(&overflowLock);
-
- procData = MyShmalloc(ConcurrentData, mNumProcIds);
- ConDataTableInit(procData, mNumProcIds);
-
- SLNPInit(&objLock);
- SLNPInit(&kidLock);
-
- endCount = numProfilers;
- SLNPInit(&startLock);
- effectiveParallelism = nominalParallelism = 1;
- }
-
- void ProfileSetAllBusy ()
- {
- int i;
-
- for (i = 0; i < numProcessors; i++)
- {
- processorList[i].idleThread.idStack.top++;
- processorList[i].idleThread.idStack.top->procID = ForkID | BusyState;
- }
- effectiveParallelism = nominalParallelism = numProcessors;
- }
-
- void SetProfileOn ()
- {
- profileOn = TRUE;
- }
- void SetProfileOff ()
- {
- profileOn = FALSE;
- }
-
- /* Runtime profiling routines, for normal processors (eg, mcount) */
-
- /* does the same thing as mcount, but ignore recursion */
- void TPushOnIdStack (t, p, s)
- register Thread *t;
- register SynchProfile *p;
- unsigned int s;
- {
- register IdStackEntry *ePtr;
- unsigned int callerID;
-
- ASSERT(t->type == ThreadType);
- ASSERT(p->type == SynchProfileType);
- ePtr = t->idStack.top;
- ePtr->procID |= OverheadState;
- callerID = ePtr->procID & AllOffMask;
- if (p->g.callerID == callerID)
- AtomicIncrP(&(p->g.num));
- else
- ProfileMustAdd((unsigned int)p, callerID, &p->g);
-
- if (ePtr >= t->idStack.limit)
- OutOfRoom(); /* die */
-
- (ePtr + 1)->procID = NoID;
- ePtr->procID &= OverheadOffMask;
- t->idStack.top = ++ePtr;
- ePtr->procID = (s) | (int)p;
- }
-
- void CallAndReplaceOnIdStack (p, s)
- register SynchProfile *p;
- unsigned int s;
- {
- register Thread *t = pP.thread;
- register IdStackEntry *ePtr = t->idStack.top;
- unsigned int callerID;
-
- ASSERT(p->type == SynchProfileType);
- ePtr->procID |= OverheadState;
- callerID = ePtr->procID & AllOffMask;
- if (p->g.callerID == callerID)
- AtomicIncrP(&(p->g.num));
- else
- ProfileMustAdd((unsigned int)p, callerID, &p->g);
- ReplaceOnIdStack(p,s);
- }
-
-
- /* nasty: how to make sure we're free from deadlock
- * on overflow, provided the procedures we call don't overflow
- * on interrupts
- */
-
- void ProfileMustAdd (calleeID, callerID, p)
- unsigned int calleeID;
- unsigned int callerID;
- GraphEntry *p;
- {
- register GraphEntry *q;
- register SpinLock *l = NULL;
- register int old;
-
- for (q = p; q->calleeID != calleeID || q->callerID != callerID; q = q->next)
- {
- while (q->next == NULL)
- {
- if (!l)
- l = id2lock(calleeID);
- old = profileOn;
- profileOn = FALSE; /* in case we get an interrupt */
- if (!SLNPTestAndGet(l))
- {
- profileOn = old;
- continue;
- }
- if (q->next != NULL)
- {
- SLNPRelease(l);
- profileOn = old;
- break;
- }
- if (q->num != 0) /* have to get one from overflow */
- {
- SLNPAcquire(&overflowLock);
- if (overflowSize == 0)
- OverflowSetup(MyShmalloc(GraphEntry,InitOverflowSize));
- q->next = &overflow[--overflowSize];
- SLNPRelease(&overflowLock);
- q = q->next;
- }
- q->num = 1;
- q->calleeID = calleeID;
- q->callerID = callerID;
- SLNPRelease(l);
- profileOn = old;
- return;
- }
- }
- AtomicIncrP(&(q->num));
- }
-
- /* Runtime sampling routines */
-
- /* return t1 - t2 */
- static FLOAT ComputeDiff (t1, t2)
- usclk_t t1, t2;
- {
- usclk_t d;
-
- if (t1 < t2)
- d = t1 + (0xffffffff - t2);
- else
- d = t1 - t2;
- return((FLOAT)d);
- }
-
/*
 * Clamp n to the range [lb, ub].  The lower bound is tested first, so
 * lb is returned whenever n < lb.
 */
static int Bound (n, lb, ub)
    int n, lb, ub;
{
    int clamped = n;

    if (clamped < lb)
        clamped = lb;
    else if (clamped > ub)
        clamped = ub;
    return(clamped);
}
-
- static int SampleStack (t, eff, nom)
- register Thread *t;
- int *eff, *nom;
- {
- int eP, nP;
- register IdStackEntry *e;
- register unsigned int *sp = tmpStack;
-
- /* Sampling begins */
- eP = effectiveParallelism;
- nP = nominalParallelism;
-
- for (e = t->idStack.top; e >= t->idStack.base; e--, sp++)
- *sp = e->procID;
- /* Sampling ends */
-
- *eff = Bound(eP, 1, MaxEffectiveParallelism) - 1;
- *nom = (nP < numProcessors) ? 0 : 1;
- iteration++;
- return(sp - tmpStack);
- }
-
- ChildData *GetChildData ()
- {
- ChildData *k;
-
- SLNPAcquire(&kidLock);
- if (--kidNum < 0)
- {
- kidData = MyShmalloc(ChildData, InitNumKids);
- ChildTableInit(kidData, InitNumKids);
- kidNum = InitNumKids - 1;
- }
- k = &kidData[kidNum];
- SLNPRelease(&kidLock);
- return(k);
- }
-
- SynchSamples *GetSampleSpace ()
- {
- SynchSamples *s;
-
- SLNPAcquire(&objLock);
- if (--objNum < 0)
- {
- objData = MyShmalloc(SynchSamples, InitNumObjs);
- SynchTableInit(objData, InitNumObjs);
- objNum = InitNumObjs - 1;
- }
- s = &objData[objNum];
- SLNPRelease(&objLock);
- return(s);
- }
-
- static void AddSample (data, diff, eP, nP, first, callee)
- ConcurrentData *data;
- FLOAT diff;
- int eP, nP, first;
- unsigned int callee;
- {
- ChildData *k;
-
- ASSERT(data->type == ConcurrentDataType && diff >= 0);
- ASSERT((eP >= 0 && eP < MaxEffectiveParallelism) && (nP == 0 || nP == 1));
- if (data->hit[myHit] < iteration)
- {
- data->hit[myHit] = iteration;
- SLNPAcquire(&data->lock);
- data->nom.byNomP[MePlusKids][BUSY][nP] += diff;
- if (first)
- {
- data->busy.byEffP[eP] += diff;
- data->nom.byNomP[JustMe][BUSY][nP] += diff;
- }
- else /* mark where busy time came from */
- {
- for (k = &data->kid; k->calleeID != callee; k = k->next)
- if (k->next == NULL)
- {
- if (k->calleeID != NoID)
- {
- k->next = GetChildData();
- k = k->next;
- }
- k->calleeID = callee;
- break;
- }
- k->busy.byEffP[eP] += diff;
- }
- SLNPRelease(&data->lock);
- }
- }
-
- static void AddNomSample (data, diff, nP, state, first)
- ConcurrentData *data;
- FLOAT diff;
- int nP, state, first;
- {
- ASSERT(data->type == ConcurrentDataType && (nP == 0 || nP == 1));
- ASSERT((state >= 0 || state < NumStates) && diff >= 0);
- if (data->hit[myHit] < iteration)
- {
- data->hit[myHit] = iteration;
- SLNPAcquire(&data->lock);
- if (first)
- data->nom.byNomP[JustMe][state][nP] += diff;
- data->nom.byNomP[MePlusKids][state][nP] += diff;
- SLNPRelease(&data->lock);
- }
- }
-
- static ConcurrentData *id2data (id)
- unsigned int id;
- {
- SynchProfile *p;
-
- id &= AllOffMask;
- if (isSynchID(id))
- {
- p = (SynchProfile *)id;
- ASSERT(p->type == SynchProfileType);
- if (p->samples == NULL)
- p->samples = GetSampleSpace();
- return(&p->samples->data);
- }
- return(&procData[id]);
- }
-
- static usclk_t ProfileProc (p)
- Processor *p;
- {
- int i, stackDepth, eP, nP;
- usclk_t next;
- FLOAT diff;
-
- next = GETUSCLK();
- stackDepth = SampleStack(p->curThread, &eP, &nP);
- diff = ComputeDiff(next, p->lastSample);
- p->lastSample = next;
-
- if (stackDepth == 0 || isOverhead(tmpStack[0]))
- AddSample(&procData[NoID], diff, eP, nP, TRUE, NoID);
- else if (isSpinning(tmpStack[0]))
- for (i = 0; i < stackDepth; i++)
- AddNomSample(id2data(tmpStack[i]), diff, nP, SPIN, i == 0);
- else
- {
- AddSample(id2data(tmpStack[0]), diff, eP, nP, TRUE, NoID);
- for (i = stackDepth - 1; i > 0; i--)
- AddSample(id2data(tmpStack[i]), diff, eP, nP, FALSE,
- (unsigned int)(tmpStack[i-1] & AllOffMask));
- }
- }
-
- static void ProfileSynch (p)
- SynchProfile *p;
- {
- int i, n[NumNumbers];
- usclk_t next;
- FLOAT diff;
- register Thread *t;
- int stackDepth, eP, nP, type;
-
- if (p->status != ACTIVE)
- return;
- for (i = 0; i < NumNumbers; i++)
- n[i] = p->number[i];
- next = GETUSCLK();
- if (t = p->thread)
- stackDepth = SampleStack(t, &eP, &nP);
- diff = ComputeDiff(next, p->lastSample);
- if (t && stackDepth != 0 && !isOverhead(tmpStack[0]) && !isBusy(tmpStack[0])
- && !isSpinning(tmpStack[0]))
- {
- if (isBlocked(tmpStack[0]))
- type = BLOCKED;
- else
- type = READY;
- for (i = 0; i < stackDepth; i++)
- AddNomSample(id2data(tmpStack[i]), diff, nP, type, i == 0);
- }
- if (!p->samples)
- p->samples = GetSampleSpace();
- for (i = 0; i < NumNumbers; i++)
- {
- n[i] = Bound(n[i], 0, MaxNominalParallelism - 1);
- p->samples->queue.length[i][n[i]] += diff;
- }
- p->lastSample = next;
- }
-
- void ProfileExternal ()
- {
- register int i;
- register Processor *p;
- register SynchProfile *s;
-
- SLNPAcquire(&startLock);
- if (!start)
- {
- start = TRUE;
- startTime = GETUSCLK();
- for (i = 0; i < numProcessors; i++)
- processorList[i].lastSample = startTime;
- }
- SLNPRelease(&startLock);
- myHit = pP.myId - numProcessors;
-
- while (!profileOver)
- {
- for (i = 0; i < numProcessors && !profileOver; i++)
- {
- p = &processorList[i];
- if (SLNPTestAndGet(&p->profLock))
- {
- ProfileProc(p);
- for (s = p->synchList; s && !profileOver; s = s->next)
- ProfileSynch(s);
- p->numSamples++;
- SLNPRelease(&p->profLock);
- }
- }
- }
- SLNPAcquire(&startLock);
- if (--endCount == 0) /* wait for everybody to check in */
- {
- #ifndef DEBUG
- KillAll();
- #endif
- for (i = 0; i < numProcessors; i++)
- {
- timeDiff += ComputeDiff(processorList[i].lastSample, startTime);
- numSamples += processorList[i].numSamples;
- }
- timeDiff /= numProcessors;
- numSamples /= numProcessors;
- DumpInfo();
- #ifdef DEBUG
- KillAll();
- #endif
- }
- SLNPRelease(&startLock);
- exit(0);
- }
-
/*
 * Fatal-error exit: report an out-of-bound processor ID on stdout,
 * call KillAll(), and exit with status 1.
 */
void TooHigh ()
{
    fputs("Fatal error: mcount() passed an out-of-bound processor ID.\n", stdout);
    fflush(stdout);

    KillAll();
    exit(1);
}
-
/*
 * Fatal-error exit: report on stdout that the profiler ID stack is out
 * of room, call KillAll(), and exit with status 1.
 */
void OutOfRoom ()
{
    fputs("Fatal error: profiler ID stack is out of room.\n", stdout);
    fflush(stdout);

    KillAll();
    exit(1);
}
-
- void ProfileFinish ()
- {
- profileOver = TRUE;
- }
-
-